# import math
import random
import csv
import urllib2
# from collections import Counter
import numpy as np
import pandas as pd
from pandas.tools.plotting import scatter_matrix
# from scipy import stats, integrate
import seaborn as sns # pip install seaborn
sns.set_style("white") # For seaborn to show axes in iPython Notebook
import matplotlib.pyplot as plt
from ggplot import *
# from matplotlib import cm
# For printing plots in iPython Notebook cells
%matplotlib inline
# NOTE(review): n and m are not used by the code below, and they disagree
# with the table that is actually built (20 rows, 4 columns) -- confirm intent.
n = 21 # Number of rows (cases or items)
m = 5 # Number of columns (variables)
headers = ['names', 'height', 'weight', 'age']
names = ['John','Jane','Jack','Jenny','Andy','Anne','Bill','Pete','Mary','Sarah','Katy','Brian','David','Helen','Mike',
         'Ruth','Alex','Keith','Linda','Dawn']
# One synthetic record per name, so every column is sized from the name
# list instead of repeating the literal 20.
nrows = len(names)
# Heights 1.59, 1.61, ... in steps of 0.02, then shuffled in place.
height = [x/100. for x in range(159, 159 + 2*nrows, 2)]
random.shuffle(height)
# Weights drawn from 50.0 .. 99.8 in steps of 0.3; ages drawn from 27 .. 66.
weight = [random.randrange(500,1000,3)/10. for _ in range(nrows)]
age = [random.choice(range(27,67)) for _ in range(nrows)]
# One [name, height, weight, age] row per person.
rdata = [list(row) for row in zip(names, height, weight, age)]
dfrdata = pd.DataFrame(rdata, columns=headers)
dfrdata
# Show the 'weight' column as a Series and as a plain list, each with its type.
print('%s %s' % (dfrdata['weight'], type(dfrdata['weight'])))
print('%s %s' % (dfrdata['weight'].tolist(), type(dfrdata['weight'].tolist())))
print(list(dfrdata['weight']))
dfrdata['weight'].tolist() == list(dfrdata['weight'])
# Distinct values of two columns: as arrays first, then as lists.
print(dfrdata['names'].unique())
print(dfrdata['age'].unique())
print(dfrdata['names'].unique().tolist())
print(dfrdata['age'].unique().tolist())
# The same queries through the top-level pd.unique on the flattened values.
print(pd.unique(dfrdata['names'].ravel()))
print(pd.unique(dfrdata['age'].ravel()))
print(pd.unique(dfrdata['names'].ravel()).tolist())
print(pd.unique(dfrdata['age'].ravel()).tolist())
# Round trip through a tab-separated file, written without the index column.
dfrdata.to_csv('data1.csv', index=False, sep='\t') # sep = ';','\t'
# dfrdata.to_csv('data1.csv', sep='\t', encoding='utf-8')
dfrdata1 = pd.read_csv('data1.csv', sep='\t')
dfrdata1
# 10 x 6 table of standard-normal draws; its columns start out labelled 0..5.
dfrdata2 = pd.DataFrame(np.random.randn(10,6))
print(dfrdata2.columns)
dfrdata2
# Relabel every integer column k as 'Var<k>'. The mapping is derived from the
# frame itself so it stays correct if the number of columns changes.
dfrdata2 = dfrdata2.rename(columns={k: 'Var%d' % k for k in dfrdata2.columns})
dfrdata2
# One column as a Series and as a plain list, each with its type.
print('%s %s' % (dfrdata2['Var2'], type(dfrdata2['Var2'])))
print('%s %s' % (dfrdata2['Var2'].tolist(), type(dfrdata2['Var2'].tolist())))
print(list(dfrdata2['Var2']))
list(dfrdata2['Var2']) == dfrdata2['Var2'].tolist()
# One plain list per column, in column order.
dfrdata2_vars = [list(dfrdata2[col]) for col in dfrdata2.columns]
# dfrdata2_vars
# Column-oriented toy data: every key maps to a list of equal length.
datadict = {'occupation': ['painter','actor','author','singer','student','manager','singer'],
            'city': ['Athens','Paris','London','Rome','Tokyo','New York','Lima'],
            'year': [1980,1998,2003,1977,1992,1977,1985],
            'reports': [5,14,22,8,31,26,17]}
for key, value in datadict.items():
    print('%s has length %s' % (key, len(value)))
# The row index is sized from the data rather than from a literal 7.
ddf = pd.DataFrame(datadict, index=range(len(datadict['city'])))
print(ddf.shape)
ddf
# Distinct values of each column.
print(ddf.city.unique())
print(ddf.occupation.unique())
print(ddf.reports.unique())
print(ddf.year.unique())
# print pd.unique(ddf.city.ravel())
# print pd.unique(ddf.occupation.ravel())
# print pd.unique(ddf.reports.ravel())
# print pd.unique(ddf.year.ravel())
# Load the mtcars table from the Rdatasets mirror (fetched over HTTP).
mtcars = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/datasets/mtcars.csv")
print(mtcars.shape)
print(mtcars.columns)
mtcars
# Relabel the 'Unnamed: 0' column as 'Cars'.
mtcars = mtcars.rename(columns={'Unnamed: 0': 'Cars'})
mtcars.columns
# Load the ggplot2 diamonds table from the Rdatasets mirror (fetched over HTTP).
diamonds = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv")
print(diamonds.shape)
# info() writes its own report and returns None, so a trailing 'None' is printed too.
print(diamonds.info())
# Relabel the 'Unnamed: 0' column as '#'.
diamonds = diamonds.rename(columns={'Unnamed: 0': '#'})
print(diamonds.columns)
diamonds.isnull().any().any() # Check for missing values
diamonds.head(5)
# diamonds.head(n=5)
# Row selection shown twice: by position (.iloc) and by index label (.loc).
# .loc replaces the deprecated .ix indexer; the two agree here because the
# frame was read without index_col and so carries the default 0..len-1 labels.
diamonds.iloc[range(5)]
diamonds.loc[range(5)]
diamonds.iloc[range(1000,1005)]
diamonds.loc[range(1000,1005)]
# Last five rows; the bounds come from len(diamonds) instead of the literal
# row count 53940.
diamonds.iloc[range(len(diamonds) - 5, len(diamonds))]
diamonds.loc[range(len(diamonds) - 5, len(diamonds))]
diamonds.tail(5)
# Distinct values of three columns.
print(diamonds.cut.unique())
print(diamonds.color.unique())
print(diamonds.clarity.unique())
# print pd.unique(diamonds.cut.ravel())
# print pd.unique(diamonds.color.ravel())
# print pd.unique(diamonds.clarity.ravel())
# Load the anscombe table from the Rdatasets mirror (fetched over HTTP).
anscombe = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/datasets/anscombe.csv")
print(anscombe.shape)
print(anscombe.columns)
anscombe
# Load the reshape2 tips table from the Rdatasets mirror (fetched over HTTP).
tips = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/reshape2/tips.csv")
print(tips.shape)
print(tips.columns)
tips.head(4)
# Distinct values of four columns.
print(tips['sex'].unique())
print(tips['smoker'].unique())
print(tips['day'].unique())
print(tips['time'].unique())
# print pd.unique(tips.sex.ravel())
# print pd.unique(tips.smoker.ravel())
# print pd.unique(tips.day.ravel())
# print pd.unique(tips.time.ravel())
# Blank out the same cells of a random 10 x 4 table twice: first with np.nan ...
dfrn = pd.DataFrame(np.random.randn(10,4))
dfrn.iloc[1:4,0] = np.nan
dfrn.iloc[5,2] = np.nan
dfrn.iloc[7:9,3] = np.nan
print(dfrn.shape)
dfrn
# ... then with None on a fresh table; the isnull() checks below run on this one.
dfrn = pd.DataFrame(np.random.randn(10,4))
dfrn.iloc[1:4,0] = None
dfrn.iloc[5,2] = None
dfrn.iloc[7:9,3] = None
print(dfrn.shape)
dfrn
# Missing-value report: any at all, per column, total count, count per column.
print(dfrn.isnull().any().any())
print(dfrn.isnull().any())
print(dfrn.isnull().sum().sum())
dfrn.isnull().sum()
# Drop every row holding a missing value, then repeat the checks.
dfrn = dfrn.dropna()
print(dfrn.isnull().any().any())
# Printed so the post-dropna count is actually shown, matching the equivalent
# check on airquality elsewhere in this file.
print(dfrn.isnull().sum().sum())
print(dfrn.shape)
dfrn
# Load the airquality table from the Rdatasets mirror (fetched over HTTP).
airquality = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/datasets/airquality.csv")
print(airquality.shape)
print(airquality.columns)
# airquality
# Missing-value report: any at all, per column, total count, count per column.
print(airquality.isnull().any().any())
print(airquality.isnull().any())
print(airquality.isnull().sum().sum())
airquality.isnull().sum()
# Drop every row holding a missing value, then repeat the checks.
airquality = airquality.dropna()
print(airquality.isnull().any().any())
print(airquality.isnull().sum().sum())
print(airquality.shape)
print(airquality.columns)
airquality.head(4)
# Load the semicolon-separated local file brain_size.csv.
brainsize = pd.read_csv("brain_size.csv", sep=';')
print(brainsize.shape)
# Missing-value report: any at all, per column, total count, count per column.
print(brainsize.isnull().any().any())
print(brainsize.isnull().any())
print(brainsize.isnull().sum().sum())
brainsize.isnull().sum()
brainsize
# `df` is another name for the same frame, so the replacement below is
# visible through `brainsize` as well.
df = brainsize
# Turn the '.' placeholders in Weight and Height into NaN. Assigning the
# result back replaces the chained `df[col].replace(..., inplace=True)`,
# which only works while df[col] is a view onto df.
# NOTE(review): the two columns are not converted to a numeric dtype here,
# exactly as before -- confirm whether later cells need numbers.
df[['Weight', 'Height']] = df[['Weight', 'Height']].replace('.', np.nan)
brainsize = df.dropna()
print(brainsize.isnull().any().any())
print(brainsize.isnull().sum().sum())
print(brainsize.shape)
brainsize
Η συνδιακύμανση δυο μεταβλητών $x = (x_1, x_2, \ldots, x_n)$ και $y = (y_1, y_2, \ldots, y_n)$ με μέσες τιμές $\bar{x}$ και $\bar{y}$, αντιστοίχως, ορίζεται ως
\begin{equation} \frac{1}{n} \sum_{i = 1}^n (x_i - \bar{x}) (y_i - \bar{y}) \end{equation}
Η δειγματική συνδιακύμανση των μεταβλητών αυτών ορίζεται ως
# Sample-covariance formula from the notebook's markdown cell (it was fused
# onto the `def` line in this export):
# \begin{equation} \frac{1}{n - 1} \sum_{i = 1}^n (x_i - \bar{x}) (y_i - \bar{y}) \end{equation}
def covariance(x,y):
    """Return the covariance of x and y, dividing by n = len(x).

    x and y are equal-length sequences of numbers. The two means are
    computed once up front instead of once per element.
    """
    x_mean = np.mean(x)
    y_mean = np.mean(y)
    return sum([(xi - x_mean)*(yi - y_mean) for (xi, yi) in zip(x,y)])/float(len(x))
def scovariance(x,y):
    """Return the sample covariance of x and y, dividing by len(x) - 1.

    x and y are equal-length sequences of numbers. At least two
    observations are needed for the divisor to be positive. The two means
    are computed once up front instead of once per element.
    """
    x_mean = np.mean(x)
    y_mean = np.mean(y)
    return sum([(xi - x_mean)*(yi - y_mean) for (xi, yi) in zip(x,y)])/float(len(x)-1)
# Compare the hand-written estimators with NumPy and pandas on two columns
# of the random table.
x = dfrdata2_vars[1]
y = dfrdata2_vars[3]
print('%s %s' % (covariance(x,y), scovariance(x,y)))
print(np.cov(x,y)[1,0])
print(np.cov(x,y))
dfrdata2.cov()
# Single entries of a covariance matrix are read by (row, column) label;
# the previous positional [0]/[1] lookups depended on the column order.
print(mtcars[['hp','wt']].cov())
print('cov(hp,wt) = %s' % (mtcars[['hp','wt']].cov().loc['wt', 'hp'],))
mtcars.cov()
print(diamonds[['table','y']].cov())
print('cov(y,table) = %s' % (diamonds[['table','y']].cov().loc['table', 'y'],))
diamonds.cov()
Η συσχέτιση (δειγματική συσχέτιση) δυο μεταβλητών ορίζεται ως η (δειγματική) συνδιακύμανσή τους διαιρούμενη με το γινόμενο των (δειγματικών) τυπικών αποκλίσεών τους.
# Pairwise correlations. Single entries are read by (row, column) label
# rather than by position.
print(dfrdata2[['Var1','Var4']].corr())
print('corr(Var1,Var4) = %s' % (dfrdata2[['Var1','Var4']].corr().loc['Var4', 'Var1'],))
# This entry is the diagonal one (Var1 with itself); it used to be printed
# under the label corr(Var1,Var4).
print('corr(Var1,Var1) = %s' % (dfrdata2[['Var1','Var4']].corr().loc['Var1', 'Var1'],))
dfrdata2.corr()
print(mtcars[['hp','wt']].corr())
print('corr(hp,wt) = %s' % (mtcars[['hp','wt']].corr().loc['hp', 'wt'],))
mtcars.corr()
print(diamonds[['table','y']].corr())
print('corr(table,y) = %s' % (diamonds[['table','y']].corr().loc['y', 'table'],))
diamonds.corr()
tips.info()
# Find categorical variables (columns):
tips.applymap(lambda x: isinstance(x, str)).any()
cv_tips = ['sex','smoker','day','time']
print(cv_tips)
# Number of distinct values in every column.
ld_tips = {col: len(tips[col].unique()) for col in tips.columns.tolist()}
print(ld_tips)
# Frequency tables of party size against the categorical columns.
pd.crosstab(tips['size'], tips.day)
pd.crosstab(tips['size'], [tips.sex, tips.smoker, tips.time])
# One size-by-category table per variable, joined side by side on 'size'.
# (sizedf holds the size-by-day table.)
sizedf = pd.crosstab(tips['size'], tips.day)
sexdf = pd.crosstab(tips['size'], tips.sex)
smokerdf = pd.crosstab(tips['size'], tips.smoker)
timedf = pd.crosstab(tips['size'], tips.time)
sizedf.join([sexdf,smokerdf,timedf])
pd.crosstab(tips['size'], tips.tip)
mtcars.info()
# Find categorical variables (columns):
mtcars.applymap(lambda x: isinstance(x, str)).any()
cv_mtcars = ['Cars']
# Number of distinct values in every column.
ld_mtcars = {col: len(mtcars[col].unique()) for col in mtcars.columns.tolist()}
print(ld_mtcars)
# Frequency tables of car name against the low-cardinality columns.
pd.crosstab(mtcars.Cars, mtcars.carb).head(5)
pd.crosstab(mtcars.Cars, [mtcars.cyl, mtcars.vs, mtcars.am, mtcars.gear]).head(5)
import copy  # left in place: it is no longer used here, but other cells may rely on it
# Work on a copy so the numeric columns of mtcars stay untouched.
mtcars1 = mtcars.copy()
# Turn each code into a '<column><value>' label, e.g. 'cyl6'.
# The label is built from the value itself. The earlier version paired
# unique() -- which lists values in order of first appearance -- with a
# hand-written ascending label list, so a value was mislabelled whenever the
# column's first appearances were not already ascending.
# Assumes the five columns hold whole numbers with no missing values
# -- TODO confirm against the CSV.
for col in ['cyl', 'vs', 'am', 'gear', 'carb']:
    mtcars1[col] = mtcars1[col].map(lambda v: '%s%d' % (col, v))
# One car-by-label table per variable, joined side by side on car name.
carbdf = pd.crosstab(mtcars1['Cars'], mtcars1.carb)
vsdf = pd.crosstab(mtcars1['Cars'], mtcars1.vs)
amdf = pd.crosstab(mtcars1['Cars'], mtcars1.am)
cyldf = pd.crosstab(mtcars1['Cars'], mtcars1.cyl)
carbdf.join([vsdf,amdf,cyldf])
diamonds.info()
# Find categorical variables (columns):
diamonds.applymap(lambda x: isinstance(x, str)).any()
cv_diamonds = ['cut','color','clarity']
# Number of distinct values in every column.
ld_diamonds = {col: len(diamonds[col].unique()) for col in diamonds.columns.tolist()}
print(ld_diamonds)
# Frequency tables of price against each categorical column.
pd.crosstab(diamonds.price, diamonds.clarity).head(5)
pd.crosstab(diamonds.price, diamonds.cut).head(5)
pd.crosstab(diamonds.price, diamonds.color).head(5)
# One price-by-category table per variable, joined side by side on price.
cutdf = pd.crosstab(diamonds.price, diamonds.cut)
colordf = pd.crosstab(diamonds.price, diamonds.color)
claritydf = pd.crosstab(diamonds.price, diamonds.clarity)
cutdf.join([colordf,claritydf]).head(5)
# Grouped summary statistics for the three datasets. Each bare expression is
# a notebook display; the cv_* lines just show the grouping candidates.
cv_tips
tips.groupby('sex').describe()
tips.groupby('sex').mean()
# Same idea for one column, grouped by an explicit key Series.
tips['total_bill'].groupby(tips['time']).describe()
# NOTE(review): the layout of describe() on a grouped Series differs between
# pandas releases, so what unstack() returns here is version-dependent.
tips['total_bill'].groupby(tips['time']).describe().unstack()
tips['total_bill'].groupby(tips['time']).std()
cv_mtcars
mtcars.groupby('Cars').describe().head(8)
mtcars.groupby('Cars').max().head(3)
# Grouping key here is the numeric column qsec.
mtcars['mpg'].groupby(mtcars['qsec']).describe().unstack().head(3)
mtcars['mpg'].groupby(mtcars['qsec']).max().head(3)
cv_diamonds
diamonds.groupby('color').describe().head(8)
diamonds.groupby('color').min().head(3)
diamonds['price'].groupby(diamonds['color']).describe().unstack().head(3)
diamonds['price'].groupby(diamonds['clarity']).mean().head(3)
# Find numerical variables
print set(tips.columns).difference(set(cv_tips))
# Remove the 'Unnamed: 0' column in place before plotting.
# Running this line a second time fails because the column is already gone.
tips.drop('Unnamed: 0',inplace=True,axis=1)
# One figure per call: line, area (unstacked/stacked), bar, histogram, KDE.
tips.plot(figsize=(10,10))
tips.plot(kind="area", figsize=(10,10), stacked=False)
tips.plot(kind="area", figsize=(10,10), stacked=True)
ax = tips.plot(kind="bar", figsize=(10,10), stacked=False)
# Suppress the x tick labels on the bar chart.
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax = tips.plot(kind="bar", figsize=(10,10), stacked=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
tips.plot(kind="hist", figsize=(10,10), stacked=True)
tips.plot(kind="kde", xlim=(-3,63), figsize=(10,10))
# KDE of a single column over the same x range.
tips['total_bill'].plot.kde(xlim=(-3,63), figsize=(10,10))
# Find numerical variables
cv_mtcars = ['Cars']
print set(mtcars.columns).difference(set(cv_mtcars))
# One figure per call: line, area (unstacked/stacked), bar, histogram, KDE.
mtcars.plot(figsize=(10,10))
mtcars.plot(kind="area", figsize=(10,10), stacked=False)
mtcars.plot(kind="area", figsize=(10,10), stacked=True)
mtcars.plot(kind="bar", figsize=(10,10), stacked=False)
mtcars.plot(kind="bar", figsize=(10,10), stacked=True)
mtcars.plot(kind="hist", figsize=(10,10), stacked=True)
# mtcars.describe()
# mtcars.plot(kind="kde", figsize=(10,10))
# KDE with an explicit x range, then for the two columns vs and am only.
mtcars.plot(kind="kde", xlim=(-50,501), figsize=(10,10))
mtcars[['vs','am']].plot.kde(figsize=(10,10))
# Find numerical variables
print set(diamonds.columns).difference(set(cv_diamonds))
# Remove the '#' column in place before plotting.
# Running this line a second time fails because the column is already gone.
diamonds.drop('#',inplace=True,axis=1)
diamonds.plot(figsize=(10,10))
diamonds.describe()
diamonds.plot(kind="area", figsize=(10,10), stacked=False)
diamonds.plot(kind="area", figsize=(10,10), stacked=True)
# Bar charts and the histogram are drawn from a random 100-row sample.
diamonds_sample = diamonds.sample(100)
ax = diamonds_sample.plot(kind="bar", figsize=(10,10), stacked=False)
# Suppress the x tick labels on the bar chart.
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax = diamonds_sample.plot(kind="bar", figsize=(10,10), stacked=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
# A second, independent 100-row sample limited to six columns (price is left out).
diamonds_sample1 = diamonds[['carat','depth','table','x','y','z']].sample(100)
ax = diamonds_sample1.plot(kind="bar", figsize=(10,10), stacked=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
diamonds_sample.plot(kind="hist", figsize=(10,10), stacked=True)
diamonds_sample1.describe()
diamonds_sample1.plot(kind="kde", figsize=(10,10))
# diamonds_sample1.plot(kind="kde", xlim=(-50,501), figsize=(10,10))
diamonds_sample1[['carat','z']].plot.kde(figsize=(10,10))
# Day-by-time frequency table for tips, shown as a table, a heatmap and
# area/bar/histogram plots. The crosstab is recomputed for every call.
print ld_tips
cv_tips
pd.crosstab(tips.day, tips.time)
# Open the figure first so the heatmap is drawn at this size.
plt.figure(figsize=(5,7))
sns.heatmap(pd.crosstab(tips.day, tips.time))
pd.crosstab(tips.day, tips.time).plot(kind="area", figsize=(10,10), stacked=False)
pd.crosstab(tips.day, tips.time).plot(kind="area", figsize=(10,10), stacked=True)
pd.crosstab(tips.day, tips.time).plot(kind="bar", figsize=(10,10), stacked=False)
pd.crosstab(tips.day, tips.time).plot(kind="bar", figsize=(10,10), stacked=True)
pd.crosstab(tips.day, tips.time).plot(kind="hist", figsize=(10,10), stacked=True)
# cyl-by-gear frequency table for mtcars, shown as a table, a heatmap and
# area/bar/histogram plots. The crosstab is recomputed for every call.
print ld_mtcars
cv_mtcars
pd.crosstab(mtcars.cyl,mtcars.gear)
# Open the figure first so the heatmap is drawn at this size.
plt.figure(figsize=(6,5))
sns.heatmap(pd.crosstab(mtcars.cyl,mtcars.gear))
pd.crosstab(mtcars.cyl,mtcars.gear).plot(kind="area", figsize=(10,10), stacked=False)
pd.crosstab(mtcars.cyl,mtcars.gear).plot(kind="area", figsize=(10,10), stacked=True)
pd.crosstab(mtcars.cyl,mtcars.gear).plot(kind="bar", figsize=(10,10), stacked=False)
pd.crosstab(mtcars.cyl,mtcars.gear).plot(kind="bar", figsize=(10,10), stacked=True)
pd.crosstab(mtcars.cyl,mtcars.gear).plot(kind="hist", figsize=(10,10), stacked=True)
# clarity-by-color frequency table for diamonds, shown as a table, a heatmap
# and area/bar/histogram plots. The crosstab is recomputed for every call.
print ld_diamonds
cv_diamonds
pd.crosstab(diamonds.clarity,diamonds.color)
# Open the figure first so the heatmap is drawn at this size.
plt.figure(figsize=(12,10))
sns.heatmap(pd.crosstab(diamonds.clarity,diamonds.color))
pd.crosstab(diamonds.clarity,diamonds.color).plot(kind="area", figsize=(10,10), stacked=False)
pd.crosstab(diamonds.clarity,diamonds.color).plot(kind="area", figsize=(10,10), stacked=True)
pd.crosstab(diamonds.clarity,diamonds.color).plot(kind="bar", figsize=(10,10), stacked=False)
pd.crosstab(diamonds.clarity,diamonds.color).plot(kind="bar", figsize=(10,10), stacked=True)
pd.crosstab(diamonds.clarity,diamonds.color).plot(kind="hist", figsize=(10,10), stacked=True)
# Synthetic two-column table: 'a' is standard-normal noise, 'b' is noise plus
# the row number 0..4999. Note that this rebinds the name `df`.
df = pd.DataFrame(np.random.randn(5000, 2), columns=['a', 'b'])
df['b'] = df['b'] + np.arange(5000)
# The same pair drawn five ways: pandas scatter and hexbin, seaborn joint
# plots (scatter, hex, kde).
# NOTE(review): `size=` is the keyword of the seaborn release this was
# written for; newer releases name it `height` -- confirm the installed version.
df.plot.scatter(x='a', y='b', figsize=(10,10))
sns.jointplot(x="a", y="b", data=df, space=0, size=10)
df.plot.hexbin(x='a', y='b', gridsize=30, figsize=(12,10))
sns.jointplot("a", "b", data=df, kind="hex", gridsize=30, space=0, size=10)
sns.jointplot("a", "b", data=df, kind="kde", space=0, size=10)
# Find numerical variables
print(set(tips.columns).difference(set(cv_tips)))
ld_tips
# Column name followed by its number of distinct values, one per line.
for col in tips.columns:
    print('%s %s' % (col, len(tips[col].unique())))
tips.head(5)
# total_bill against tip: pandas scatter and hexbin, seaborn joint plots.
# NOTE(review): `size=` is the keyword of the seaborn release this was
# written for; newer releases name it `height` -- confirm the installed version.
tips.plot.scatter(x='total_bill',y='tip',figsize=(10,10))
sns.jointplot(x="total_bill", y="tip", data=tips, space=0, size=10)
tips.plot.hexbin(x='total_bill', y='tip', gridsize=25, figsize=(12,10))
sns.jointplot("total_bill", "tip", data=tips, kind="hex", gridsize=25, space=0, size=10)
# Pairwise plots of three columns with pandas, then with seaborn.
# `sss` only captures the return value of scatter_matrix.
sss=scatter_matrix(tips[['total_bill','tip','size']], alpha=0.9, figsize=(12, 12), color='black', diagonal='hist')
sns.pairplot(tips[['total_bill','tip','size']],kind='scatter',diag_kind='hist',size=3.5)
# All columns of tips, coloured by day.
sns.pairplot(tips, hue="day", size=3.5)
# Find numerical variables
print set(mtcars.columns).difference(set(cv_mtcars))
ld_mtcars
mtcars.plot.scatter(x='wt',y='qsec',figsize=(10,10))
# Two series on shared axes: the first call creates the axes, the second
# draws onto them through ax=ax.
ax=mtcars.plot.scatter(x='mpg',y='hp',label='hp per mpg',figsize=(10,10),color='blue')
mtcars.plot.scatter(x='mpg',y='disp',label='disp per mpg',figsize=(10,10),ax=ax,color='red')
ax=mtcars.plot.scatter(x='mpg',y='qsec',label='qsec per mpg',figsize=(10,10),color='green')
mtcars.plot.scatter(x='mpg',y='wt',label='wt per mpg',figsize=(10,10),ax=ax,color='black')
# mpg against hp: pandas scatter and hexbin, seaborn joint plots.
# NOTE(review): `size=` is the keyword of the seaborn release this was
# written for; newer releases name it `height` -- confirm the installed version.
mtcars.plot.scatter(x='mpg',y='hp', figsize=(10,10))
sns.jointplot(x="mpg", y="hp", data=mtcars, space=0, size=10)
mtcars.plot.hexbin(x='mpg', y='hp', gridsize=25, figsize=(12,10))
sns.jointplot("mpg", "hp", data=mtcars, kind="hex", gridsize=25, space=0, size=10)
sns.jointplot("mpg", "hp", data=mtcars, kind="kde", space=0, size=10)
# A third variable (qsec) as marker colour, then as marker area.
mtcars.plot.scatter(x='mpg', y='hp', c='qsec', s=50, figsize=(12,10))
mtcars.plot.scatter(x='mpg', y='hp', s=mtcars['qsec']*20, figsize=(10,10))
# Hexbin cells coloured by the largest qsec falling in each cell.
mtcars.plot.hexbin(x='mpg', y='hp', C='qsec', reduce_C_function=np.max, gridsize=25, figsize=(12,10))
mtcars.plot.hexbin(x='drat', y='wt', C='qsec', reduce_C_function=np.max, gridsize=25, figsize=(12,10))
# Pairwise plots of four columns with pandas, then with seaborn.
sss= scatter_matrix(mtcars[['mpg','hp','carb','disp']], alpha=0.9, figsize=(12, 12), color='black', diagonal='kde')
sns.pairplot(mtcars[['mpg','hp','carb','disp']],kind='scatter',diag_kind='kde')
# NOTE(review): this rebinds mtcars1, replacing the relabelled copy built
# earlier in the file, and uses the numeric column qsec as the hue.
mtcars1 = mtcars[['mpg','hp','carb','disp','qsec']]
sns.pairplot(mtcars1, hue="qsec")
# Find numerical variables
print set(diamonds.columns).difference(set(cv_diamonds))
ld_diamonds
# price against carat: pandas scatter and hexbin, seaborn joint plots.
# Each plot is drawn twice: once over the full range, once restricted to
# price 0..4000 and carat 0..1.
# NOTE(review): `size=` is the keyword of the seaborn release this was
# written for; newer releases name it `height` -- confirm the installed version.
diamonds.plot.scatter(x='price', y='carat', figsize=(10,10))
sns.jointplot(x="price", y="carat", data=diamonds, space=0, size=10)
diamonds.plot.hexbin(x='price', y='carat', gridsize=50, figsize=(12,10))
ax=diamonds.plot.hexbin(x='price', y='carat', gridsize=50, figsize=(12,10))
ax.set_xlim(0,4000)
ax.set_ylim(0,1)
sns.jointplot("price", "carat", data=diamonds, kind="hex", gridsize=50, space=0, size=10)
g=sns.jointplot("price", "carat", data=diamonds, kind="hex", gridsize=50, space=0, size=10)
g.ax_joint.set(xlim=(0, 4000), ylim=(0, 1))
sns.jointplot("price", "carat", data=diamonds, kind="kde", space=0, size=10)
g=sns.jointplot("price", "carat", data=diamonds, kind="kde", space=0, size=10)
g.ax_joint.set(xlim=(0, 4000), ylim=(0, 1))
# ggplot scatter plots coloured by clarity, then by cut.
p = ggplot(aes(x='price', y='carat',color="clarity"), data=diamonds)
p + geom_point()
p = ggplot(aes(x='price', y='carat',color="cut"), data=diamonds)
p + geom_point()
# Pairwise plots of four columns with pandas, then with seaborn.
sss = scatter_matrix(diamonds[['carat','depth','table','price']], alpha=0.9, figsize=(12, 12), color='black', diagonal='hist')
sns.pairplot(diamonds[['carat','depth','table','price']],kind='scatter',diag_kind='hist',size=2.8)
# Same four columns plus clarity, which is used as the hue.
diamonds1 = diamonds[['carat','depth','table','price','clarity']]
sns.pairplot(diamonds1, hue="clarity", size=2.8)
# Load seaborn's bundled iris example dataset.
iris = sns.load_dataset("iris")
iris.shape
iris.info()
# Find categorical/numerical variables (columns):
print(iris.applymap(lambda x: isinstance(x, str)).any())
cv_iris = ['species']
print('Numerical variables: %s' % (list(set(iris.columns).difference(set(cv_iris))),))
# Number of distinct values in every column.
ld_iris = {col: len(iris[col].unique()) for col in iris.columns.tolist()}
print(ld_iris)
# Pair plots: plain, coloured by species, with the 'husl' palette, and with
# one marker shape per species.
g = sns.pairplot(iris)
g = sns.pairplot(iris, hue="species")
g = sns.pairplot(iris, hue="species", palette="husl")
g = sns.pairplot(iris, hue="species", markers=["o", "s", "D"])